This metric is the share of people who are poor in a county who live in census tracts with poverty rates over 40%. If a county’s overall poverty rate is 20% but people in poverty are spread out evenly across all census tracts, the index would equal 0; if they were heavily concentrated in certain tracts, the index would approach 1.
Process:
All numbers come for the Census API. The documentation for the Census API is available here. We pull all of the race/ethnicity counts for 2018 using library(censusapi). Note: This will require a Census API key. Add the key to census_api_key-template.R and then delete then delete “template”. It is sourced above.
To do this we have to identify census tracts with poverty rates over 40% in each county, count the number of residents in those tracts who are poor, sum that up and divided it by the total number of poor residents in the county.
options(scipen = 999)
library(tidyverse)
library(censusapi)
library(urbnthemes)
library(reactable)
set_urbn_defaults(style = "print")
source(here::here("06_neighborhoods", "R", "census_api_key.R"))
source(here::here("06_neighborhoods", "R", "get_vars.R"))
https://api.census.gov/data/2018/acs/acs5/variables.html
vars <- c( # Estimate!!Total!!Income in the past 12 months below poverty level
"B00001_001E", # UNWEIGHTED SAMPLE COUNT OF THE POPULATION
#"B01001_001E", # SEX BY AGE
"B17001_001E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (Total)
"B17001_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE
"B17001_002M",
#"B17001A_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE)
#"B17001A_002M",
"B17001B_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)
"B17001B_002M",
"B17001C_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
"B17001C_002M",
"B17001D_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (ASIAN ALONE)
"B17001D_002M",
"B17001E_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
"B17001E_002M",
"B17001F_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (SOME OTHER RACE ALONE)
"B17001F_002M",
"B17001G_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (TWO OR MORE RACES)
"B17001G_002M",
"B17001H_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE, NOT HISPANIC OR LATINO)
"B17001H_002M",
"B17001I_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (HISPANIC OR LATINO)
"B17001I_002M"
)
# pull Census tracts for 2018
state_fips <- paste0("state:", unique(urbnmapr::states$state_fips))
tracts <- map_df(state_fips, ~getCensus(name = "acs/acs5",
vars = vars,
region = "tract:*",
regionin = .x,
vintage = 2018)) %>%
as_tibble()
# rename the variables
tracts <- tracts %>%
rename(
people = B17001_001E,
poverty = B17001_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE
poverty_moe = B17001_002M,
#poverty_white = B17001A_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE)
#poverty_white_moe = B17001A_002M,
poverty_black = B17001B_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)
poverty_black_moe = B17001B_002M,
poverty_aian = B17001C_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
poverty_aian_moe = B17001C_002M,
poverty_asian = B17001D_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (ASIAN ALONE)
poverty_asian_moe = B17001D_002M,
poverty_pacific = B17001E_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
poverty_pacific_moe = B17001E_002M,
poverty_other = B17001F_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (SOME OTHER RACE ALONE)
poverty_other_moe = B17001F_002M,
poverty_twoplus = B17001G_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (TWO OR MORE RACES)
poverty_twoplus_moe = B17001G_002M,
poverty_white_nonhispanic = B17001H_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE, NOT HISPANIC OR LATINO)
poverty_white_nonhispanic_moe = B17001H_002M,
poverty_hispanic = B17001I_002E, # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (HISPANIC OR LATINO)
poverty_hispanic_moe = B17001I_002M
)
Some tracts don’t have any population. We drop those tracts.
tracts <- tracts %>%
tidylog::filter(people > 0)
There was a data collection error in Rio Arriba County, NM (source). We drop these observations.
tracts <- tracts %>%
tidylog::filter(state != "35" | county != "039")
Check the number of people. It should be around 314,943,184.
tracts %>%
summarize(sum(people))
## # A tibble: 1 x 1
## `sum(people)`
## <dbl>
## 1 314904186
We need to combine the small groups into a group for other races and ethnicities. The Census Bureau typically only posts cross tabs for up to two variables. This requires race, ethnicity, and poverty status so the resulting groups are not disjoint.
knitr::include_graphics(here::here("06_neighborhoods", "www", "images", "race.png"))
Combine the race/ethnicity groups into the group of interest.
tracts <- tracts %>%
mutate(
poverty_other_races =
poverty_aian +
poverty_asian +
poverty_pacific +
poverty_other +
poverty_twoplus
) #%>%
#select(-poverty_aian, -poverty_asian, -poverty_pacific, -poverty_other, -poverty_twoplus)
This Census presentation recommends using the maximum margin of error when aggregating multiple zero estimates.
One way this approximation can differ from the actual MOE is if you were aggregating multiple zero estimates. In this case, the approximate MOE could diverge from the actual margin of error. And so the - our recommendation is to only include one zero estimate margin of error and include the largest one.
# pivot the point estimates
values <- tracts %>%
select(state,
county,
tract,
poverty_aian,
poverty_asian,
poverty_pacific,
poverty_other,
poverty_twoplus) %>%
pivot_longer(c(-state, -county, -tract), names_to = "group", values_to = "value")
# pivot the margins of error
moes <- tracts %>%
select(state,
county,
tract,
poverty_aian_moe,
poverty_asian_moe,
poverty_pacific_moe,
poverty_other_moe,
poverty_twoplus_moe) %>%
pivot_longer(c(-state, -county, -tract), names_to = "group", values_to = "moe") %>%
mutate(group = str_replace(group, "_moe", ""))
# combine the point estimates and margins of error
other_moe <- left_join(values, moes, by = c("state", "county", "tract", "group"))
rm(moes, values)
# keep MOE for non-zero estimates and keep the largest MOE for zero estimates
other_moe <- other_moe %>%
group_by(state, county, tract) %>%
mutate(moe_rank = row_number(desc(moe))) %>%
mutate(moe_rank = if_else(value == 0, moe_rank, 5L)) %>%
mutate(moe_rank = ifelse(moe_rank == min(moe_rank), moe_rank, 0L)) %>%
filter(value != 0 | moe_rank != 0) %>%
select(-moe_rank)
# combine the margins of error
other_moe <- other_moe %>%
summarize(poverty_other_races_moe = sqrt(sum(moe ^ 2))) %>%
ungroup()
# append to the original data set
tracts <- left_join(tracts, other_moe, by = c("state", "county", "tract"))
tracts <- tracts %>%
select(state,
county,
tract,
people,
poverty,
poverty_black,
poverty_hispanic,
poverty_other_races,
poverty_white_nonhispanic,
poverty_moe,
poverty_black_moe,
poverty_hispanic_moe,
poverty_other_races_moe,
poverty_white_nonhispanic_moe)
Look at the margins of error. A large share of the Other Races and Ethnicities have coefficients of variation greater than 0.4.
tracts %>%
summarize(mean(poverty_other_races_moe / poverty_other_races > 0.4))
## # A tibble: 1 x 1
## `mean(poverty_other_races_moe/poverty_other_races > 0.4)`
## <dbl>
## 1 0.984
Let’s look at the margins of error in relation to the counts of people in each race/ethnicity category in each county. Observations to the upper left of the black line have coefficients of variation in excess of 0.4.
tracts %>%
ggplot(aes(poverty_black, poverty_black_moe)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most Black Estimates Have Large MOE",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
tracts %>%
ggplot(aes(poverty_hispanic, poverty_hispanic_moe)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most Hispanic Estimates Have Large MOE",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
tracts %>%
ggplot(aes(poverty_other_races, poverty_other_races_moe)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most Other Races and Ethnicities Estimates Have Large MOE",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
tracts %>%
ggplot(aes(poverty_white_nonhispanic, poverty_white_nonhispanic_moe)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(title = "Most White, non-Hispanic Estimates Have Large MOE",
subtitle = "Line represents a CV of 0.4") +
coord_equal() +
scatter_grid()
As mentioned earlier, these race/ethnicity groups are not disjoint. Accordingly, summing the groups will result in population counts that exceed the population. It will also result in poverty counts that are inflated.
tracts %>%
mutate(poverty_summed = poverty_black + poverty_hispanic + poverty_other_races + poverty_white_nonhispanic) %>%
ggplot(aes(poverty, poverty_summed)) +
geom_point(alpha = 0.2, size = 1) +
coord_equal() +
labs(title = "The Counts Are Unequal because the Groups Aren't Disjoint") +
scatter_grid()
We turn the count of people in poverty into a rate.
tracts <- tracts %>%
mutate(poverty_rate = poverty / people)
stopifnot(min(tracts$poverty_rate) >= 0)
stopifnot(max(tracts$poverty_rate) <= 1)
We calculate the rate of poverty in high-poverty tracts.
tracts <- tracts %>%
mutate(
high_poverty = if_else(poverty_rate > 0.4, poverty, 0),
high_poverty_black = if_else(poverty_rate > 0.4, poverty_black, 0),
high_poverty_hispanic = if_else(poverty_rate > 0.4, poverty_hispanic, 0),
high_poverty_other_races = if_else(poverty_rate > 0.4, poverty_other_races, 0),
high_poverty_white_nonhispanic = if_else(poverty_rate > 0.4, poverty_white_nonhispanic, 0)
)
We calculate the overall poverty and the number of people without a poverty estimate and then sum to the county level.
counties_summary <- tracts %>%
group_by(state, county) %>%
summarize(
people = sum(people),
tracts = n(),
# poverty
poverty = sum(poverty),
poverty_black = sum(poverty_black),
poverty_hispanic = sum(poverty_hispanic),
poverty_other_races = sum(poverty_other_races),
poverty_white_nonhispanic = sum(poverty_white_nonhispanic),
# high poverty
high_poverty = sum(high_poverty),
high_poverty_black = sum(high_poverty_black),
high_poverty_hispanic = sum(high_poverty_hispanic),
high_poverty_other_races = sum(high_poverty_other_races),
high_poverty_white_nonhispanic = sum(high_poverty_white_nonhispanic),
# MOE
poverty_moe = sqrt(sum(poverty_moe ^ 2)),
poverty_black_moe = sqrt(sum(poverty_black_moe ^ 2)),
poverty_hispanic_moe = sqrt(sum(poverty_hispanic_moe ^ 2)),
poverty_other_races_moe = sqrt(sum(poverty_other_races_moe ^ 2)),
poverty_white_nonhispanic_moe = sqrt(sum(poverty_white_nonhispanic_moe ^ 2))
) %>%
ungroup()
counties_summary <- counties_summary %>%
mutate(poverty_rate = poverty / people)
We pull in the county-level data and compare it to the calculated county-level data. The poverty rates should be identical; however, they may differ from numbers published elsewhere (like here) that use Small-Area Income and Poverty Estimates (SAIPE).
counties_test <- map_df(state_fips, ~getCensus(name = "acs/acs5",
vars = vars,
region = "county:*",
regionin = .x,
vintage = 2018)) %>%
as_tibble()
counties_test <- counties_test %>%
mutate(poverty_rate_test = B17001_002E / B17001_001E) %>%
select(state, county, poverty_rate_test) %>%
arrange(state, county)
stopifnot(
counties_summary %>%
select(state, county, poverty_rate) %>%
left_join(counties_test, by = c("state", "county")) %>%
filter(poverty_rate != poverty_rate_test) %>%
nrow() == 0
)
rm(counties_test)
We need the conditional logic to deal with division by zero. If there is no poverty then poverty exposure is zero.
counties_summary <- counties_summary %>%
mutate(
poverty_exposure = high_poverty / poverty,
poverty_exposure_black =
if_else(condition = poverty_black > 0,
true = high_poverty_black / poverty_black,
false = 0),
poverty_exposure_hispanic =
if_else(condition = poverty_hispanic > 0,
true = high_poverty_hispanic / poverty_hispanic,
false = 0),
poverty_exposure_other_races =
if_else(condition = poverty_other_races > 0,
true = high_poverty_other_races / poverty_other_races,
false = 0),
poverty_exposure_white_nonhispanic =
if_else(condition = poverty_white_nonhispanic > 0,
true = high_poverty_white_nonhispanic / poverty_white_nonhispanic,
false = 0),
)
stopifnot(
all(map_dbl(counties_summary, ~sum(is.na(.x))) == 0)
)
Let’s look at the highest values.
counties_summary %>%
arrange(desc(poverty_exposure)) %>%
select(state, county, poverty_exposure)
## # A tibble: 3,141 x 3
## state county poverty_exposure
## <chr> <chr> <dbl>
## 1 02 158 1
## 2 13 061 1
## 3 28 055 1
## 4 28 063 1
## 5 46 031 1
## 6 46 095 1
## 7 46 121 1
## 8 46 137 1
## 9 01 011 0.919
## 10 01 105 0.903
## # … with 3,131 more rows
There shouldn’t be any missing values.
stopifnot(
counties_summary %>%
filter(is.na(poverty_exposure)) %>%
nrow() == 0
)
# create a long version of the subgroup data
counties_summary_subgroups <- counties_summary %>%
select(state, county, contains("exposure")) %>%
pivot_longer(c(-state, -county), names_to = "subgroup", values_to = "poverty_exposure") %>%
mutate(subgroup =
recode(
subgroup,
poverty_exposure = "All",
poverty_exposure_black = "Black, Non-Hispanic",
poverty_exposure_hispanic = "Hispanic",
poverty_exposure_other_races = "Other Races and Ethnicities",
poverty_exposure_white_nonhispanic = "White, Non-Hispanic"
)
)
# check the bounds of the poverty exposure metric
stopifnot(min(counties_summary_subgroups$poverty_exposure) >= 0)
stopifnot(max(counties_summary_subgroups$poverty_exposure) <= 1)
The table shows the calculated metrics. Click on the variable columns to sort the table.
counties_summary %>%
ggplot(aes(poverty_exposure)) +
geom_histogram() +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
labs(title = "Most Counties in 2018 Have No Poverty Exposure",
subtitle = "The Distribution of Poverty Exposure")
counties_summary %>%
ggplot(aes(tracts, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Number of Tracts in County")
counties_summary %>%
ggplot(aes(people, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Population in County")
counties_summary %>%
ggplot(aes(poverty_rate, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
scatter_grid() +
labs(title = "County Poverty Rate and County Poverty Exposure Are Related")
counties_summary_subgroups %>%
filter(subgroup != "All") %>%
ggplot(aes(poverty_exposure)) +
geom_histogram() +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup) +
labs(title = "Most Counties in 2018 Have No Poverty Exposure",
subtitle = "The Distribution of Poverty Exposure")
counties_summary_subgroups_plots <- left_join(counties_summary_subgroups, select(counties_summary, -poverty_exposure), by = c("state", "county"))
counties_summary_subgroups_plots %>%
filter(subgroup!= "All") %>%
ggplot(aes(tracts, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup, nrow = 2) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Number of Tracts in County")
counties_summary_subgroups_plots %>%
filter(subgroup!= "All") %>%
ggplot(aes(people, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup) +
scatter_grid() +
labs(title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Population in County")
counties_summary_subgroups_plots %>%
filter(subgroup!= "All") %>%
ggplot(aes(poverty_rate, poverty_exposure)) +
geom_point(alpha = 0.2,
size = 1) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))) +
facet_wrap(~subgroup) +
scatter_grid() +
labs(title = "County Poverty Rate and County Poverty Exposure Are Related")
rm(counties_summary_subgroups_plots)
#' Suppress counties
#'
#' @param race The variable for the count in a race/ethnicity group
#' @param exposure The variable name for the exposure index
#' @param threshold The minimum size of the race group to report the exposure index
#'
#' @return
#'
suppress_county <- function(race, exposure, threshold) {
exposure <- if_else(race <= threshold, as.numeric(NA), exposure)
return(exposure)
}
counties_summary %>%
summarize(
all = sum(is.na(poverty_exposure)),
black_nh = sum(is.na(poverty_exposure_black)),
hispanic = sum(is.na(poverty_exposure_hispanic)),
other_nh = sum(is.na(poverty_exposure_other_races)),
white_nh = sum(is.na(poverty_exposure_white_nonhispanic))
)
## # A tibble: 1 x 5
## all black_nh hispanic other_nh white_nh
## <int> <int> <int> <int> <int>
## 1 0 0 0 0 0
counties_summary <- counties_summary %>%
mutate(
# overall
poverty_exposure =
suppress_county(
race = poverty,
exposure = poverty_exposure,
threshold = 30
),
# black
poverty_exposure_black =
suppress_county(
race = poverty_black,
exposure = poverty_exposure_black,
threshold = 30
),
# hispanic
poverty_exposure_hispanic =
suppress_county(
race = poverty_hispanic,
exposure = poverty_exposure_hispanic,
threshold = 30
),
# other races and ethnicities
poverty_exposure_other_races =
suppress_county(
race = poverty_other_races,
exposure = poverty_exposure_other_races,
threshold = 30
),
# white, non-hispanic
poverty_exposure_white_nonhispanic =
suppress_county(
race = poverty_white_nonhispanic,
exposure = poverty_exposure_white_nonhispanic,
threshold = 30
)
)
counties_summary %>%
summarize(
all = sum(is.na(poverty_exposure)),
black_nh = sum(is.na(poverty_exposure_black)),
hispanic = sum(is.na(poverty_exposure_hispanic)),
other_nh = sum(is.na(poverty_exposure_other_races)),
white_nh = sum(is.na(poverty_exposure_white_nonhispanic))
)
## # A tibble: 1 x 5
## all black_nh hispanic other_nh white_nh
## <int> <int> <int> <int> <int>
## 1 6 1006 546 426 14
We need to add data quality flags with 1, 2, or 3. The values are outlined in the data standards.
1 - If the county coefficient of variation for the count in the group is less than 0.22 - If the county coefficient of variation for the count in the group is less than 0.43 - If the county coefficient of variation for the count in the group exceeds 0.4NA - If the metric is missingcounties_summary <- counties_summary %>%
mutate(
poverty_cv = poverty_moe / poverty,
poverty_black_cv = poverty_black_moe / poverty_black,
poverty_hispanic_cv = poverty_hispanic_moe / poverty_hispanic,
poverty_other_races_cv = poverty_other_races_moe / poverty_other_races,
poverty_white_nonhispanic_cv = poverty_white_nonhispanic_moe / poverty_white_nonhispanic
)
counties_summary %>%
filter(poverty_cv >= 0.4) %>%
ggplot(aes(poverty, poverty_cv, color = poverty <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_black_cv >= 0.4) %>%
ggplot(aes(poverty_black, poverty_black_cv, color = poverty_black <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "Black, non-Hispanic: The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty_black <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_hispanic_cv >= 0.4) %>%
ggplot(aes(poverty_hispanic, poverty_hispanic_cv, color = poverty_hispanic <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "Hispanic: The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty_hispanic <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_other_races_cv >= 0.4) %>%
ggplot(aes(poverty_other_races, poverty_other_races_cv, color = poverty_other_races <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "Other Races and Ethnicities: The Worst CVs Will be Dropped for n <= 30",
subtitle = "poverty_other_races <= 30 in yellow") +
scatter_grid()
counties_summary %>%
filter(poverty_white_nonhispanic_cv >= 0.4) %>%
ggplot(aes(poverty_white_nonhispanic, poverty_white_nonhispanic_cv, color = poverty_white_nonhispanic <= 30)) +
geom_point(alpha = 0.2) +
labs(title = "White, non_hispanic: The Worst CVs Will be Dropped for n <= 30",
subtitle = "white_nh <= 30 in yellow") +
scatter_grid()
#' Assign a data quality flag
#'
#' @param race A vector of counts of a race/ethnicity group within a county
#' @param exposure A race/ethnicity exposure metric
#'
#' @return A numeric data quality flag
#'
set_quality <- function(cv, exposure) {
quality <- case_when(
cv < 0.2 ~ 1,
cv < 0.4 ~ 2,
cv >= 0.4 ~ 3
)
quality <- if_else(is.na(exposure), as.numeric(NA), quality)
return(quality)
}
counties_summary <- counties_summary %>%
mutate(
poverty_exposure_quality = set_quality(cv = poverty_cv, exposure = poverty_exposure),
poverty_exposure_black_quality = set_quality(cv = poverty_black_cv, exposure = poverty_exposure_black),
poverty_exposure_hispanic_quality = set_quality(cv = poverty_hispanic_cv, exposure = poverty_exposure_hispanic),
poverty_exposure_other_races_quality = set_quality(cv = poverty_other_races_cv, exposure = poverty_exposure_other_races),
poverty_exposure_white_nonhispanic_quality = set_quality(cv = poverty_white_nonhispanic_cv, exposure = poverty_exposure_white_nonhispanic)
)
count(counties_summary, poverty_exposure_quality)
## # A tibble: 4 x 2
## poverty_exposure_quality n
## <dbl> <int>
## 1 1 2204
## 2 2 810
## 3 3 121
## 4 NA 6
count(counties_summary, poverty_exposure_black_quality)
## # A tibble: 4 x 2
## poverty_exposure_black_quality n
## <dbl> <int>
## 1 1 449
## 2 2 596
## 3 3 1090
## 4 NA 1006
count(counties_summary, poverty_exposure_hispanic_quality)
## # A tibble: 4 x 2
## poverty_exposure_hispanic_quality n
## <dbl> <int>
## 1 1 306
## 2 2 617
## 3 3 1672
## 4 NA 546
count(counties_summary, poverty_exposure_other_races_quality)
## # A tibble: 4 x 2
## poverty_exposure_other_races_quality n
## <dbl> <int>
## 1 1 367
## 2 2 651
## 3 3 1697
## 4 NA 426
count(counties_summary, poverty_exposure_white_nonhispanic_quality)
## # A tibble: 4 x 2
## poverty_exposure_white_nonhispanic_quality n
## <dbl> <int>
## 1 1 1749
## 2 2 1117
## 3 3 261
## 4 NA 14
Most of the counties with missing values are very small.
missing <- counties_summary %>%
filter(
is.na(poverty_exposure) |
is.na(poverty_exposure_black) |
is.na(poverty_exposure_hispanic) |
is.na(poverty_exposure_other_races) |
is.na(poverty_exposure_white_nonhispanic)
)
max(missing$people)
## [1] 121504
max(missing$tracts)
## [1] 21
We need to include all counties in the published data even if we don’t have a metric for the county. We load the county file and join our metrics to the county file.
# load the 2018 county file
all_counties <- read_csv(here::here("geographic-crosswalks", "data", "county-file.csv")) %>%
filter(year == 2018)
final_data <- left_join(all_counties, counties_summary, by = c("state", "county")) %>%
select(year,
state,
county,
poverty_exposure)
write_csv(final_data,
here::here("06_neighborhoods", "poverty-exposure", "poverty-exposure.csv"))
counties_summary_subgroups <- counties_summary_subgroups %>%
mutate(subgroup_type = if_else(subgroup == "All", "all", "race-ethnicity"))
final_data_race_ethnicity <- left_join(all_counties, counties_summary_subgroups, by = c("state", "county")) %>%
select(year,
state,
county,
subgroup_type,
subgroup,
poverty_exposure)
write_csv(final_data_race_ethnicity,
here::here("06_neighborhoods", "poverty-exposure", "poverty-exposure_race-ethnicity.csv"))